gprofiler for gene enrichment analysis

Read in DEGs

The DEGs are already ordered by smallest to largest adjusted p-value.

star_all  <- read.delim("../../results/star/DEGs/LBD_gene_DEGs_FDRq0.05.txt", 
                    header = TRUE, sep = "\t")
star_XX <- read.delim("../../results/star/DEGs/LBD_gene_DEGs_FDRq0.05_XX.txt", 
                    header = TRUE, sep = "\t")
star_XY <- read.delim("../../results/star/DEGs/LBD_gene_DEGs_FDRq0.05_XY.txt", 
                     header = TRUE, sep = "\t")

Subset lists

sort = FALSE so that the order of the DEGs remains. The DEGs are ordered by adjusted p-value May also order by log2FC

# sort by fold change. Biggest fold change is the first gene
star_all_sort_positiveFC <- star_XY[order(-star_all$adj.P.Val),]
# now repeat for negative fold changes
star_all_sort_negativeFC <- star_XY[order(star_all$adj.P.Val),]

star_XX_sort_positiveFC <- star_XX[order(-star_XX$adj.P.Val),]
star_XX_sort_negativeFC <- star_XX[order(star_XX$adj.P.Val),]

star_XY_sort_positiveFC <- star_XY[order(-star_XY$adj.P.Val),]
star_XY_sort_negativeFC <- star_XY[order(star_XY$adj.P.Val),]

# when subsetting, keep the order. I.e sort = FALSE. Do not resort 
up_star_all <- subset(star_all_sort_positiveFC$gene_name, 
                      star_all_sort_positiveFC$logFC > 0, sort = FALSE)
down_star_all <- subset(star_all_sort_negativeFC$gene_name, 
                        star_all_sort_negativeFC$logFC < 0, sort = FALSE)

up_star_XX <- subset(star_XX_sort_positiveFC$gene_name, 
                     star_XX_sort_positiveFC$logFC > 0, sort = FALSE)
down_star_XX <- subset(star_XX_sort_negativeFC$gene_name, 
                       star_XX_sort_negativeFC$logFC < 0, sort = FALSE)

up_star_XY <- subset(star_XY_sort_positiveFC$gene_name, 
                     star_XY_sort_positiveFC$logFC > 0, sort = FALSE)
down_star_XY <- subset(star_XY_sort_negativeFC$gene_name, 
                       star_XY_sort_negativeFC$logFC < 0, sort = FALSE)

All samples (XX and XY)

Enrichment analysis

ordered_query = TRUE specify that the genes are in a meaningful order. Ordered by adjusted p-value.

gp_up <- gost(up_star_all, ordered_query = TRUE, organism = "hsapiens")
gp_down <- gost(down_star_all, ordered_query = TRUE, organism = "hsapiens")

# inspect
head(gp_up$result)
##     query significant      p_value term_size query_size intersection_size
## 1 query_1        TRUE 0.0007260184      2428         31                15
## 2 query_1        TRUE 0.0017305449       736         24                 8
## 3 query_1        TRUE 0.0171855004       704         24                 7
## 4 query_1        TRUE 0.0171855004       704         24                 7
## 5 query_1        TRUE 0.0184989682       712         24                 7
## 6 query_1        TRUE 0.0387070894      4784         21                14
##   precision      recall    term_id source                            term_name
## 1 0.4838710 0.006177924 GO:0007399  GO:BP           nervous system development
## 2 0.3333333 0.010869565 GO:0099536  GO:BP                   synaptic signaling
## 3 0.2916667 0.009943182 GO:0098916  GO:BP anterograde trans-synaptic signaling
## 4 0.2916667 0.009943182 GO:0007268  GO:BP       chemical synaptic transmission
## 5 0.2916667 0.009831461 GO:0099537  GO:BP             trans-synaptic signaling
## 6 0.6666667 0.002926421 GO:0048731  GO:BP                   system development
##   effective_domain_size source_order                parents
## 1                 21029         3278             GO:0048731
## 2                 21029        22351             GO:0007267
## 3                 21029        22182             GO:0099537
## 4                 21029         3161             GO:0098916
## 5                 21029        22352             GO:0099536
## 6                 21029        15045 GO:0007275, GO:0048856
head(gp_down$result)
##     query significant     p_value term_size query_size intersection_size
## 1 query_1        TRUE 0.001085487        16          2                 2
## 2 query_1        TRUE 0.001560167       148        105                 8
## 3 query_1        TRUE 0.002288569        23          2                 2
## 4 query_1        TRUE 0.003672565        29          2                 2
## 5 query_1        TRUE 0.012465010         2         53                 2
## 6 query_1        TRUE 0.012465010        53          2                 2
##    precision     recall    term_id source
## 1 1.00000000 0.12500000 GO:0015671  GO:BP
## 2 0.07619048 0.05405405 GO:0002181  GO:BP
## 3 1.00000000 0.08695652 GO:0015669  GO:BP
## 4 1.00000000 0.06896552 GO:0042744  GO:BP
## 5 0.03773585 1.00000000 GO:0002752  GO:BP
## 6 1.00000000 0.03773585 GO:0042743  GO:BP
##                                                     term_name
## 1                                            oxygen transport
## 2                                     cytoplasmic translation
## 3                                               gas transport
## 4                         hydrogen peroxide catabolic process
## 5 cell surface pattern recognition receptor signaling pathway
## 6                         hydrogen peroxide metabolic process
##   effective_domain_size source_order                parents
## 1                 21029         5357             GO:0015669
## 2                 21029          850             GO:0006412
## 3                 21029         5355             GO:0006810
## 4                 21029        11933 GO:0042743, GO:0044248
## 5                 21029         1387 GO:0002220, GO:0002221
## 6                 21029        11932             GO:0072593
## Warning: `gather_()` was deprecated in tidyr 1.2.0.
## Please use `gather()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

## png 
##   2
## png 
##   2

Plot up and down enrichment together

multi_gp = gost(list("down-regulated" = down_star_all, 
                     "up-regulated" = up_star_all), ordered_query = TRUE, 
                     organism = "hsapiens")

# rearrange so that up-regualted is on top 
neworder <- c("up-regulated","down-regulated")
multi_gp$result <- arrange(transform(multi_gp$result,
             query=factor(query,levels=neworder)),query)

# interactive plot 
gostplot(multi_gp, interactive = TRUE)

XX only

Enrichment analysis

#gp_up <- gost(up_star_XX, ordered_query = TRUE, organism = "hsapiens")
gp_down <- gost(down_star_XX, ordered_query = TRUE, organism = "hsapiens")

# inspect
#head(gp_up$result)
head(gp_down$result)
##     query significant    p_value term_size query_size intersection_size
## 1 query_1        TRUE 0.02240358         9          1                 1
## 2 query_1        TRUE 0.02240358         9          1                 1
## 3 query_1        TRUE 0.02240358         9          1                 1
##   precision    recall     term_id source                        term_name
## 1         1 0.1111111 HPA:0070000    HPA                        cartilage
## 2         1 0.1111111 HPA:0070111    HPA    cartilage; chondrocytes[≥Low]
## 3         1 0.1111111 HPA:0070112    HPA cartilage; chondrocytes[≥Medium]
##   effective_domain_size source_order     parents
## 1                 10976           86 HPA:0000000
## 2                 10976           87 HPA:0070000
## 3                 10976           88 HPA:0070111

Plot enrichment analysis

interactive = TRUE means the plot is interactive. You can select a point of interest to see what it is.
interactive = FALSE is needed when saving the plot capped = FALSE indicates if the Y-asix p-value should be capped at 16 or not.

Plot up and down enrichment together

## No results to show
## Please make sure that the organism or namespace is correct

XY only

Enrichment analysis

gp_up <- gost(up_star_XY, ordered_query = TRUE, organism = "hsapiens")
gp_down <- gost(down_star_XY, ordered_query = TRUE, organism = "hsapiens")

# inspect
head(gp_up$result)
##     query significant      p_value term_size query_size intersection_size
## 1 query_1        TRUE 7.951616e-04      2428         35                16
## 2 query_1        TRUE 1.493668e-02       736         31                 8
## 3 query_1        TRUE 3.207755e-02        52          3                 2
## 4 query_1        TRUE 2.918933e-05      1305         31                12
## 5 query_1        TRUE 8.451554e-05       633         53                11
## 6 query_1        TRUE 3.441771e-04      1347         43                13
##   precision      recall    term_id source                         term_name
## 1 0.4571429 0.006589786 GO:0007399  GO:BP        nervous system development
## 2 0.2580645 0.010869565 GO:0099536  GO:BP                synaptic signaling
## 3 0.6666667 0.038461538 GO:0048013  GO:BP ephrin receptor signaling pathway
## 4 0.3870968 0.009195402 GO:0045202  GO:CC                           synapse
## 5 0.2075472 0.017377567 GO:0030424  GO:CC                              axon
## 6 0.3023256 0.009651076 GO:0043005  GO:CC                 neuron projection
##   effective_domain_size source_order    parents
## 1                 21029         3278 GO:0048731
## 2                 21029        22351 GO:0007267
## 3                 21029        14506 GO:0007169
## 4                 21672         2443 GO:0030054
## 5                 21672         1098 GO:0043005
## 6                 21672         2090 GO:0120025
head(gp_down$result)
##     query significant      p_value term_size query_size intersection_size
## 1 query_1        TRUE 0.0006647771       148         94                 8
## 2 query_1        TRUE 0.0010854871        16          2                 2
## 3 query_1        TRUE 0.0022885685        23          2                 2
## 4 query_1        TRUE 0.0036725645        29          2                 2
## 5 query_1        TRUE 0.0106377731         2         49                 2
## 6 query_1        TRUE 0.0124650097        53          2                 2
##    precision     recall    term_id source
## 1 0.08510638 0.05405405 GO:0002181  GO:BP
## 2 1.00000000 0.12500000 GO:0015671  GO:BP
## 3 1.00000000 0.08695652 GO:0015669  GO:BP
## 4 1.00000000 0.06896552 GO:0042744  GO:BP
## 5 0.04081633 1.00000000 GO:0002752  GO:BP
## 6 1.00000000 0.03773585 GO:0042743  GO:BP
##                                                     term_name
## 1                                     cytoplasmic translation
## 2                                            oxygen transport
## 3                                               gas transport
## 4                         hydrogen peroxide catabolic process
## 5 cell surface pattern recognition receptor signaling pathway
## 6                         hydrogen peroxide metabolic process
##   effective_domain_size source_order                parents
## 1                 21029          850             GO:0006412
## 2                 21029         5357             GO:0015669
## 3                 21029         5355             GO:0006810
## 4                 21029        11933 GO:0042743, GO:0044248
## 5                 21029         1387 GO:0002220, GO:0002221
## 6                 21029        11932             GO:0072593

Plot enrichment analysis

## png 
##   2
## png 
##   2

Plot up and down enrichment together

multi_gp = gost(list("down-regulated" = down_star_XY, 
                     "up-regulated" = up_star_XY), ordered_query = TRUE, 
                     organism = "hsapiens")

# rearrange so that up-regualted is on top 
neworder <- c("up-regulated","down-regulated")
multi_gp$result <- arrange(transform(multi_gp$result,
             query=factor(query,levels=neworder)),query)

# interactive plot 
gostplot(multi_gp, interactive = TRUE)